{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python391jvsc74a57bd0b8f714736076dee7cc4e7cd002fa6517bd2bcb67ffacfab068ce43d1637242d6",
   "display_name": "Python 3.9.1 64-bit ('base': conda)"
  },
  "metadata": {
   "interpreter": {
    "hash": "b8f714736076dee7cc4e7cd002fa6517bd2bcb67ffacfab068ce43d1637242d6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Integer code for each label that appears in the 'move' column.\n",
    "class_dict = {\n",
    "    'OBJECTIVE': 0,\n",
    "    'METHODS': 1,\n",
    "    'RESULTS': 2,\n",
    "    'CONCLUSIONS': 3,\n",
    "    'BACKGROUND': 4,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first CSV column has no header and mirrors the row index (presumably a saved\n",
    "# index); reading it as the index avoids a redundant 'Unnamed: 0' column.\n",
    "df = pd.read_csv('result_move.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1485b4cd0>"
      ]
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "source": [
    "df.groupby(['id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1485b4640>\n"
     ]
    }
   ],
   "source": [
    "print(df.groupby(['id']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Hold out 500 ids as the test split. Sorting the unique ids and drawing from a\n",
    "# dedicated seeded generator makes the split identical on every Restart & Run All,\n",
    "# without touching the global random state.\n",
    "ids = sorted(set(df['id'].to_list()))\n",
    "test_ids = random.Random(42).sample(ids, 500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every id that was not drawn into test_ids is kept for training (order of ids preserved).\n",
    "# Looking ids up in a set is O(1) per id instead of scanning the test_ids list each time.\n",
    "test_id_set = set(test_ids)\n",
    "train_ids = [i for i in ids if i not in test_id_set]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "7004"
      ]
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "len(train_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "       Unnamed: 0     id         move  \\\n",
       "0               0      6    OBJECTIVE   \n",
       "1               1      6      METHODS   \n",
       "2               2      6      RESULTS   \n",
       "3               3      6  CONCLUSIONS   \n",
       "4               4      8    OBJECTIVE   \n",
       "...           ...    ...          ...   \n",
       "32096       32096  51155   BACKGROUND   \n",
       "32097       32097  51155    OBJECTIVE   \n",
       "32098       32098  51155      METHODS   \n",
       "32099       32099  51155      RESULTS   \n",
       "32100       32100  51155  CONCLUSIONS   \n",
       "\n",
       "                                                    text  \n",
       "0      The aims of this study is to investigate the m...  \n",
       "1      Carbonate apatite as endodontic sealer materia...  \n",
       "2      Behavioral test shows impaired motor function ...  \n",
       "3      The implantation of carbonate apatite material...  \n",
       "4      Insulin access for people with diabetes is a g...  \n",
       "...                                                  ...  \n",
       "32096  The use of controlled vocabularies (CVs) aims ...  \n",
       "32097  To identify and classify the type of CVs, and ...  \n",
       "32098  A systematic mapping study, collecting empiric...  \n",
       "32099  This work identified 2348 papers published per...  \n",
       "32100  The evolution of the last 10 years in the numb...  \n",
       "\n",
       "[29979 rows x 4 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>move</th>\n      <th>text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>6</td>\n      <td>OBJECTIVE</td>\n      <td>The aims of this study is to investigate the m...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>6</td>\n      <td>METHODS</td>\n      <td>Carbonate apatite as endodontic sealer materia...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>6</td>\n      <td>RESULTS</td>\n      <td>Behavioral test shows impaired motor function ...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>6</td>\n      <td>CONCLUSIONS</td>\n      <td>The implantation of carbonate apatite material...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>8</td>\n      <td>OBJECTIVE</td>\n      <td>Insulin access for people with diabetes is a g...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>32096</th>\n      <td>32096</td>\n      <td>51155</td>\n      <td>BACKGROUND</td>\n      <td>The use of controlled vocabularies (CVs) aims ...</td>\n    </tr>\n    <tr>\n      <th>32097</th>\n      <td>32097</td>\n      <td>51155</td>\n      <td>OBJECTIVE</td>\n      <td>To identify and classify the type of CVs, and ...</td>\n    </tr>\n    <tr>\n      <th>32098</th>\n      <td>32098</td>\n      <td>51155</td>\n      <td>METHODS</td>\n      <td>A systematic mapping study, collecting empiric...</td>\n    </tr>\n    
<tr>\n      <th>32099</th>\n      <td>32099</td>\n      <td>51155</td>\n      <td>RESULTS</td>\n      <td>This work identified 2348 papers published per...</td>\n    </tr>\n    <tr>\n      <th>32100</th>\n      <td>32100</td>\n      <td>51155</td>\n      <td>CONCLUSIONS</td>\n      <td>The evolution of the last 10 years in the numb...</td>\n    </tr>\n  </tbody>\n</table>\n<p>29979 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "df[df['id'].isin(train_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep every row whose id was not sampled into test_ids.\n",
    "# `~` is the idiomatic boolean negation for a mask, and .copy() makes train_df an\n",
    "# independent frame so later column assignments do not raise SettingWithCopyWarning.\n",
    "train_df = df[~df['id'].isin(test_ids)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the string labels in 'move' as integers via class_dict.\n",
    "# Building a new frame with .assign() avoids writing into a filtered slice of df,\n",
    "# which is what triggered SettingWithCopyWarning with direct column assignment.\n",
    "# Labels that are not keys of class_dict become NaN (Series.map lookup semantics).\n",
    "train_df = train_df.assign(move=train_df['move'].map(class_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "       Unnamed: 0     id  move  \\\n",
       "0               0      6     0   \n",
       "1               1      6     1   \n",
       "2               2      6     2   \n",
       "3               3      6     3   \n",
       "4               4      8     0   \n",
       "...           ...    ...   ...   \n",
       "32096       32096  51155     4   \n",
       "32097       32097  51155     0   \n",
       "32098       32098  51155     1   \n",
       "32099       32099  51155     2   \n",
       "32100       32100  51155     3   \n",
       "\n",
       "                                                    text  \n",
       "0      The aims of this study is to investigate the m...  \n",
       "1      Carbonate apatite as endodontic sealer materia...  \n",
       "2      Behavioral test shows impaired motor function ...  \n",
       "3      The implantation of carbonate apatite material...  \n",
       "4      Insulin access for people with diabetes is a g...  \n",
       "...                                                  ...  \n",
       "32096  The use of controlled vocabularies (CVs) aims ...  \n",
       "32097  To identify and classify the type of CVs, and ...  \n",
       "32098  A systematic mapping study, collecting empiric...  \n",
       "32099  This work identified 2348 papers published per...  \n",
       "32100  The evolution of the last 10 years in the numb...  \n",
       "\n",
       "[29979 rows x 4 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>move</th>\n      <th>text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>6</td>\n      <td>0</td>\n      <td>The aims of this study is to investigate the m...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>6</td>\n      <td>1</td>\n      <td>Carbonate apatite as endodontic sealer materia...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>6</td>\n      <td>2</td>\n      <td>Behavioral test shows impaired motor function ...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>6</td>\n      <td>3</td>\n      <td>The implantation of carbonate apatite material...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>8</td>\n      <td>0</td>\n      <td>Insulin access for people with diabetes is a g...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>32096</th>\n      <td>32096</td>\n      <td>51155</td>\n      <td>4</td>\n      <td>The use of controlled vocabularies (CVs) aims ...</td>\n    </tr>\n    <tr>\n      <th>32097</th>\n      <td>32097</td>\n      <td>51155</td>\n      <td>0</td>\n      <td>To identify and classify the type of CVs, and ...</td>\n    </tr>\n    <tr>\n      <th>32098</th>\n      <td>32098</td>\n      <td>51155</td>\n      <td>1</td>\n      <td>A systematic mapping study, collecting empiric...</td>\n    </tr>\n    <tr>\n      <th>32099</th>\n      <td>32099</td>\n      
<td>51155</td>\n      <td>2</td>\n      <td>This work identified 2348 papers published per...</td>\n    </tr>\n    <tr>\n      <th>32100</th>\n      <td>32100</td>\n      <td>51155</td>\n      <td>3</td>\n      <td>The evolution of the last 10 years in the numb...</td>\n    </tr>\n  </tbody>\n</table>\n<p>29979 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 21
    }
   ],
   "source": [
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}